#ifndef CUFFTDX_FFT_2401_FP32_INV_PTX_HPP
#define CUFFTDX_FFT_2401_FP32_INV_PTX_HPP



// Explicit specialization cufftdx_private_function<378, float, 1>.
//
// A single inline-PTX block that takes the 7 complex<float> values held in
// rmem[0..6], runs four 7-point butterfly passes over them and writes the 7
// results back to rmem[0..6]. Between passes the values are exchanged through
// shared memory (three store/load round trips). Four radix-7 passes match a
// 7*7*7*7 = 2401-point decomposition; the include guard of this header names
// the transform FFT_2401_FP32_INV.
//
// Parameters
//   rmem  Per-thread array. Elements 0..6 (.x and .y) are read as inputs and
//         are overwritten with the outputs.
//   smem  32-bit value used directly as the base of every st.shared /
//         ld.shared address computed below.
//
// External symbols read
//   lut_sp_7_2401, lut_sp_7_343, lut_sp_7_49 are passed as 64-bit addresses
//   and read with ld.global.v2.f32 (one 8-byte entry per thread per pass).
//   Presumably these are twiddle tables defined elsewhere -- confirm against
//   their definition.
//
// Thread and shared-memory layout, as computed by the PTX
//   q = tid.x / 343 and t = tid.x % 343. The division is a multiply by
//   3205573259 (written as -1089394037) followed by a 40-bit right shift.
//   base = smem + tid.y * 19208 + q * 19208   (19208 = 2401 * 8 bytes)
//   pass 1 stores 7 values at base + t*56 + 8*j,                    j = 0..6
//   pass 2 stores 7 values at base + (t%7)*8  + (t/7)*392   + 56*j
//   pass 3 stores 7 values at base + (t%49)*8 + (t/49)*2744 + 392*j
//   every load phase reads    base + t*8 + 2744*k,                  k = 0..6
//   NOTE(review): tid.y and q are scaled by the same 19208-byte stride, so a
//   launch in which both are non-zero would address overlapping regions.
//   Presumably callers only ever vary one of the two -- confirm.
//
// Arithmetic
//   The immediates are IEEE-754 single-precision bit patterns:
//     0f3F1F9D07 ~ 0.6234898 (cos 2pi/7)   0f3F48261C ~ 0.7818315 (sin 2pi/7)
//     0f3E63DC87 ~ 0.2225209 (-cos 4pi/7)  0f3F7994E0 ~ 0.9749279 (sin 4pi/7)
//     0f3F66A5E5 ~ 0.9009689 (-cos 6pi/7)  0f3EDE2602 ~ 0.4338837 (sin 6pi/7)
//   After each of the first three passes one LUT entry w = (c, s) is loaded
//   (index t, t/7 and t/49 respectively), w^2..w^6 are built by repeated
//   complex multiplication, and butterfly output j (j = 1..6) is scaled as
//     re' = re * Re(w^j) + im * Im(w^j)
//     im' = im * Re(w^j) - re * Im(w^j)
//   before being stored. Output 0 is stored unscaled. The fourth pass writes
//   its results straight to the output operands with no scaling.
//
// Synchronization
//   The block contains six "barrier.sync 0" instructions, one before and one
//   after each of the three shared-memory store phases. Per PTX semantics a
//   barrier.sync without a thread count involves every thread of the CTA, so
//   this function must be reached by all of them in non-divergent control
//   flow.
template<> __forceinline__ __device__ void cufftdx_private_function<378, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

// The PTX below is a raw string literal: any text placed inside it becomes
// part of the emitted assembly, so all commentary is kept outside of it.
asm volatile (R"({
.reg .f32 f<626>;
.reg .b32 r<27>;
.reg .b64 rd<17>;
mov.u32 r1, %tid.y;
mov.u32 r2, %14;
mad.lo.s32 r3, r1, 19208, r2;
mov.u32 r4, %tid.x;
add.f32 f29, %20, %34;
add.f32 f30, %18, f29;
add.f32 f31, %23, %31;
add.f32 f32, f31, f30;
add.f32 f33, %26, %28;
add.f32 f34, %22, %35;
add.f32 f35, %19, f34;
add.f32 f36, %25, %33;
add.f32 f37, f36, f35;
add.f32 f38, %27, %30;
fma.rn.f32 f39, f29, 0f3F1F9D07, %18;
mul.f32 f40, f31, 0f3E63DC87;
sub.f32 f41, f39, f40;
mul.f32 f42, f33, 0f3F66A5E5;
sub.f32 f43, f41, f42;
sub.f32 f44, %22, %35;
mul.f32 f45, f44, 0f3F48261C;
sub.f32 f46, %25, %33;
fma.rn.f32 f47, f46, 0f3F7994E0, f45;
sub.f32 f48, %27, %30;
fma.rn.f32 f49, f48, 0f3EDE2602, f47;
sub.f32 f50, f43, f49;
add.f32 f51, f49, f43;
mul.f32 f52, f29, 0f3E63DC87;
sub.f32 f53, %18, f52;
mul.f32 f54, f31, 0f3F66A5E5;
sub.f32 f55, f53, f54;
fma.rn.f32 f56, f33, 0f3F1F9D07, f55;
mul.f32 f57, f44, 0f3F7994E0;
mul.f32 f58, f46, 0f3EDE2602;
sub.f32 f59, f57, f58;
mul.f32 f60, f48, 0f3F48261C;
sub.f32 f61, f59, f60;
sub.f32 f62, f56, f61;
add.f32 f63, f61, f56;
mul.f32 f64, f29, 0f3F66A5E5;
sub.f32 f65, %18, f64;
fma.rn.f32 f66, f31, 0f3F1F9D07, f65;
mul.f32 f67, f33, 0f3E63DC87;
sub.f32 f68, f66, f67;
mul.f32 f69, f44, 0f3EDE2602;
mul.f32 f70, f46, 0f3F48261C;
sub.f32 f71, f69, f70;
fma.rn.f32 f72, f48, 0f3F7994E0, f71;
sub.f32 f73, f68, f72;
add.f32 f74, f72, f68;
fma.rn.f32 f75, f34, 0f3F1F9D07, %19;
mul.f32 f76, f36, 0f3E63DC87;
sub.f32 f77, f75, f76;
mul.f32 f78, f38, 0f3F66A5E5;
sub.f32 f79, f77, f78;
sub.f32 f80, %20, %34;
mul.f32 f81, f80, 0f3F48261C;
sub.f32 f82, %23, %31;
fma.rn.f32 f83, f82, 0f3F7994E0, f81;
sub.f32 f84, %26, %28;
fma.rn.f32 f85, f84, 0f3EDE2602, f83;
add.f32 f86, f85, f79;
sub.f32 f87, f79, f85;
mul.f32 f88, f34, 0f3E63DC87;
sub.f32 f89, %19, f88;
mul.f32 f90, f36, 0f3F66A5E5;
sub.f32 f91, f89, f90;
fma.rn.f32 f92, f38, 0f3F1F9D07, f91;
mul.f32 f93, f80, 0f3F7994E0;
mul.f32 f94, f82, 0f3EDE2602;
sub.f32 f95, f93, f94;
mul.f32 f96, f84, 0f3F48261C;
sub.f32 f97, f95, f96;
add.f32 f98, f97, f92;
sub.f32 f99, f92, f97;
mul.f32 f100, f34, 0f3F66A5E5;
sub.f32 f101, %19, f100;
fma.rn.f32 f102, f36, 0f3F1F9D07, f101;
mul.f32 f103, f38, 0f3E63DC87;
sub.f32 f104, f102, f103;
mul.f32 f105, f80, 0f3EDE2602;
mul.f32 f106, f82, 0f3F48261C;
sub.f32 f107, f105, f106;
fma.rn.f32 f108, f84, 0f3F7994E0, f107;
add.f32 f109, f108, f104;
sub.f32 f110, f104, f108;
mul.wide.u32 rd2, r4, -1089394037;
shr.u64 rd3, rd2, 40;
cvt.u32.u64 r5, rd3;
mul.lo.s32 r6, r5, 343;
sub.s32 r7, r4, r6;
mad.lo.s32 r8, r5, 19208, r3;
mul.wide.u32 rd4, r7, 8;
mov.u64 rd5, %15;
add.s64 rd6, rd5, rd4;
ld.global.v2.f32 {f111, f112}, [rd6];
mul.f32 f115, f86, f112;
mul.f32 f116, f50, f112;
mul.f32 f117, f111, f86;
mul.f32 f118, f111, f111;
mul.f32 f119, f112, f112;
sub.f32 f120, f118, f119;
mul.f32 f121, f112, f111;
fma.rn.f32 f122, f112, f111, f121;
mul.f32 f123, f98, f122;
mul.f32 f124, f62, f122;
mul.f32 f125, f120, f98;
mul.f32 f126, f111, f120;
mul.f32 f127, f112, f122;
sub.f32 f128, f126, f127;
mul.f32 f129, f111, f122;
fma.rn.f32 f130, f112, f120, f129;
mul.f32 f131, f109, f130;
mul.f32 f132, f73, f130;
mul.f32 f133, f128, f109;
mul.f32 f134, f111, f128;
mul.f32 f135, f112, f130;
sub.f32 f136, f134, f135;
mul.f32 f137, f111, f130;
fma.rn.f32 f138, f112, f128, f137;
mul.f32 f139, f110, f138;
mul.f32 f140, f74, f138;
mul.f32 f141, f136, f110;
mul.f32 f142, f111, f136;
mul.f32 f143, f112, f138;
sub.f32 f144, f142, f143;
mul.f32 f145, f111, f138;
fma.rn.f32 f146, f112, f136, f145;
mul.f32 f147, f99, f146;
mul.f32 f148, f63, f146;
mul.f32 f149, f144, f99;
mul.f32 f150, f111, f144;
mul.f32 f151, f112, f146;
sub.f32 f152, f150, f151;
mul.f32 f153, f111, f146;
fma.rn.f32 f154, f112, f144, f153;
mul.f32 f155, f87, f154;
mul.f32 f156, f51, f154;
mul.f32 f157, f152, f87;
barrier.sync 0;
mad.lo.s32 r9, r7, 56, r8;
add.f32 f158, f38, f37;
add.f32 f159, f33, f32;
st.shared.v2.f32 [r9], {f159, f158};
fma.rn.f32 f160, f111, f50, f115;
sub.f32 f161, f117, f116;
st.shared.v2.f32 [r9+8], {f160, f161};
fma.rn.f32 f162, f120, f62, f123;
sub.f32 f163, f125, f124;
st.shared.v2.f32 [r9+16], {f162, f163};
sub.f32 f164, f133, f132;
fma.rn.f32 f165, f128, f73, f131;
st.shared.v2.f32 [r9+24], {f165, f164};
fma.rn.f32 f166, f136, f74, f139;
sub.f32 f167, f141, f140;
st.shared.v2.f32 [r9+32], {f166, f167};
fma.rn.f32 f168, f144, f63, f147;
sub.f32 f169, f149, f148;
st.shared.v2.f32 [r9+40], {f168, f169};
fma.rn.f32 f170, f152, f51, f155;
sub.f32 f171, f157, f156;
st.shared.v2.f32 [r9+48], {f170, f171};
barrier.sync 0;
mad.lo.s32 r10, r7, -48, r9;
ld.shared.v2.f32 {f172, f173}, [r10];
ld.shared.v2.f32 {f176, f177}, [r10+2744];
ld.shared.v2.f32 {f180, f181}, [r10+5488];
ld.shared.v2.f32 {f184, f185}, [r10+8232];
ld.shared.v2.f32 {f188, f189}, [r10+10976];
ld.shared.v2.f32 {f192, f193}, [r10+13720];
ld.shared.v2.f32 {f196, f197}, [r10+16464];
add.f32 f200, f176, f196;
add.f32 f201, f172, f200;
add.f32 f202, f180, f192;
add.f32 f203, f202, f201;
add.f32 f204, f184, f188;
add.f32 f205, f177, f197;
add.f32 f206, f173, f205;
add.f32 f207, f181, f193;
add.f32 f208, f207, f206;
add.f32 f209, f185, f189;
fma.rn.f32 f210, f200, 0f3F1F9D07, f172;
mul.f32 f211, f202, 0f3E63DC87;
sub.f32 f212, f210, f211;
mul.f32 f213, f204, 0f3F66A5E5;
sub.f32 f214, f212, f213;
sub.f32 f215, f177, f197;
mul.f32 f216, f215, 0f3F48261C;
sub.f32 f217, f181, f193;
fma.rn.f32 f218, f217, 0f3F7994E0, f216;
sub.f32 f219, f185, f189;
fma.rn.f32 f220, f219, 0f3EDE2602, f218;
sub.f32 f221, f214, f220;
add.f32 f222, f220, f214;
mul.f32 f223, f200, 0f3E63DC87;
sub.f32 f224, f172, f223;
mul.f32 f225, f202, 0f3F66A5E5;
sub.f32 f226, f224, f225;
fma.rn.f32 f227, f204, 0f3F1F9D07, f226;
mul.f32 f228, f215, 0f3F7994E0;
mul.f32 f229, f217, 0f3EDE2602;
sub.f32 f230, f228, f229;
mul.f32 f231, f219, 0f3F48261C;
sub.f32 f232, f230, f231;
sub.f32 f233, f227, f232;
add.f32 f234, f232, f227;
mul.f32 f235, f200, 0f3F66A5E5;
sub.f32 f236, f172, f235;
fma.rn.f32 f237, f202, 0f3F1F9D07, f236;
mul.f32 f238, f204, 0f3E63DC87;
sub.f32 f239, f237, f238;
mul.f32 f240, f215, 0f3EDE2602;
mul.f32 f241, f217, 0f3F48261C;
sub.f32 f242, f240, f241;
fma.rn.f32 f243, f219, 0f3F7994E0, f242;
sub.f32 f244, f239, f243;
add.f32 f245, f243, f239;
fma.rn.f32 f246, f205, 0f3F1F9D07, f173;
mul.f32 f247, f207, 0f3E63DC87;
sub.f32 f248, f246, f247;
mul.f32 f249, f209, 0f3F66A5E5;
sub.f32 f250, f248, f249;
sub.f32 f251, f176, f196;
mul.f32 f252, f251, 0f3F48261C;
sub.f32 f253, f180, f192;
fma.rn.f32 f254, f253, 0f3F7994E0, f252;
sub.f32 f255, f184, f188;
fma.rn.f32 f256, f255, 0f3EDE2602, f254;
add.f32 f257, f256, f250;
sub.f32 f258, f250, f256;
mul.f32 f259, f205, 0f3E63DC87;
sub.f32 f260, f173, f259;
mul.f32 f261, f207, 0f3F66A5E5;
sub.f32 f262, f260, f261;
fma.rn.f32 f263, f209, 0f3F1F9D07, f262;
mul.f32 f264, f251, 0f3F7994E0;
mul.f32 f265, f253, 0f3EDE2602;
sub.f32 f266, f264, f265;
mul.f32 f267, f255, 0f3F48261C;
sub.f32 f268, f266, f267;
add.f32 f269, f268, f263;
sub.f32 f270, f263, f268;
mul.f32 f271, f205, 0f3F66A5E5;
sub.f32 f272, f173, f271;
fma.rn.f32 f273, f207, 0f3F1F9D07, f272;
mul.f32 f274, f209, 0f3E63DC87;
sub.f32 f275, f273, f274;
mul.f32 f276, f251, 0f3EDE2602;
mul.f32 f277, f253, 0f3F48261C;
sub.f32 f278, f276, f277;
fma.rn.f32 f279, f255, 0f3F7994E0, f278;
add.f32 f280, f279, f275;
sub.f32 f281, f275, f279;
mul.wide.u32 rd7, r7, 613566757;
shr.u64 rd8, rd7, 32;
cvt.u32.u64 r11, rd8;
sub.s32 r12, r7, r11;
shr.u32 r13, r12, 1;
add.s32 r14, r13, r11;
shr.u32 r15, r14, 2;
mul.lo.s32 r16, r15, 7;
sub.s32 r17, r7, r16;
mul.wide.u32 rd9, r15, 8;
mov.u64 rd10, %16;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f282, f283}, [rd11];
mul.f32 f286, f257, f283;
mul.f32 f287, f221, f283;
mul.f32 f288, f282, f257;
mul.f32 f289, f282, f282;
mul.f32 f290, f283, f283;
sub.f32 f291, f289, f290;
mul.f32 f292, f283, f282;
fma.rn.f32 f293, f283, f282, f292;
mul.f32 f294, f269, f293;
mul.f32 f295, f233, f293;
mul.f32 f296, f291, f269;
mul.f32 f297, f282, f291;
mul.f32 f298, f283, f293;
sub.f32 f299, f297, f298;
mul.f32 f300, f282, f293;
fma.rn.f32 f301, f283, f291, f300;
mul.f32 f302, f280, f301;
mul.f32 f303, f244, f301;
mul.f32 f304, f299, f280;
mul.f32 f305, f282, f299;
mul.f32 f306, f283, f301;
sub.f32 f307, f305, f306;
mul.f32 f308, f282, f301;
fma.rn.f32 f309, f283, f299, f308;
mul.f32 f310, f281, f309;
mul.f32 f311, f245, f309;
mul.f32 f312, f307, f281;
mul.f32 f313, f282, f307;
mul.f32 f314, f283, f309;
sub.f32 f315, f313, f314;
mul.f32 f316, f282, f309;
fma.rn.f32 f317, f283, f307, f316;
mul.f32 f318, f270, f317;
mul.f32 f319, f234, f317;
mul.f32 f320, f315, f270;
mul.f32 f321, f282, f315;
mul.f32 f322, f283, f317;
sub.f32 f323, f321, f322;
mul.f32 f324, f282, f317;
fma.rn.f32 f325, f283, f315, f324;
mul.f32 f326, f258, f325;
mul.f32 f327, f222, f325;
mul.f32 f328, f323, f258;
shl.b32 r18, r17, 3;
add.s32 r19, r8, r18;
barrier.sync 0;
mad.lo.s32 r20, r15, 392, r19;
add.f32 f329, f209, f208;
add.f32 f330, f204, f203;
st.shared.v2.f32 [r20], {f330, f329};
fma.rn.f32 f331, f282, f221, f286;
sub.f32 f332, f288, f287;
st.shared.v2.f32 [r20+56], {f331, f332};
fma.rn.f32 f333, f291, f233, f294;
sub.f32 f334, f296, f295;
st.shared.v2.f32 [r20+112], {f333, f334};
fma.rn.f32 f335, f299, f244, f302;
sub.f32 f336, f304, f303;
st.shared.v2.f32 [r20+168], {f335, f336};
sub.f32 f337, f312, f311;
fma.rn.f32 f338, f307, f245, f310;
st.shared.v2.f32 [r20+224], {f338, f337};
fma.rn.f32 f339, f315, f234, f318;
sub.f32 f340, f320, f319;
st.shared.v2.f32 [r20+280], {f339, f340};
fma.rn.f32 f341, f323, f222, f326;
sub.f32 f342, f328, f327;
st.shared.v2.f32 [r20+336], {f341, f342};
barrier.sync 0;
ld.shared.v2.f32 {f343, f344}, [r10];
ld.shared.v2.f32 {f347, f348}, [r10+2744];
ld.shared.v2.f32 {f351, f352}, [r10+5488];
ld.shared.v2.f32 {f355, f356}, [r10+8232];
ld.shared.v2.f32 {f359, f360}, [r10+10976];
ld.shared.v2.f32 {f363, f364}, [r10+13720];
ld.shared.v2.f32 {f367, f368}, [r10+16464];
add.f32 f371, f347, f367;
add.f32 f372, f343, f371;
add.f32 f373, f351, f363;
add.f32 f374, f373, f372;
add.f32 f375, f355, f359;
add.f32 f376, f348, f368;
add.f32 f377, f344, f376;
add.f32 f378, f352, f364;
add.f32 f379, f378, f377;
add.f32 f380, f356, f360;
fma.rn.f32 f381, f371, 0f3F1F9D07, f343;
mul.f32 f382, f373, 0f3E63DC87;
sub.f32 f383, f381, f382;
mul.f32 f384, f375, 0f3F66A5E5;
sub.f32 f385, f383, f384;
sub.f32 f386, f348, f368;
mul.f32 f387, f386, 0f3F48261C;
sub.f32 f388, f352, f364;
fma.rn.f32 f389, f388, 0f3F7994E0, f387;
sub.f32 f390, f356, f360;
fma.rn.f32 f391, f390, 0f3EDE2602, f389;
sub.f32 f392, f385, f391;
add.f32 f393, f391, f385;
mul.f32 f394, f371, 0f3E63DC87;
sub.f32 f395, f343, f394;
mul.f32 f396, f373, 0f3F66A5E5;
sub.f32 f397, f395, f396;
fma.rn.f32 f398, f375, 0f3F1F9D07, f397;
mul.f32 f399, f386, 0f3F7994E0;
mul.f32 f400, f388, 0f3EDE2602;
sub.f32 f401, f399, f400;
mul.f32 f402, f390, 0f3F48261C;
sub.f32 f403, f401, f402;
sub.f32 f404, f398, f403;
add.f32 f405, f403, f398;
mul.f32 f406, f371, 0f3F66A5E5;
sub.f32 f407, f343, f406;
fma.rn.f32 f408, f373, 0f3F1F9D07, f407;
mul.f32 f409, f375, 0f3E63DC87;
sub.f32 f410, f408, f409;
mul.f32 f411, f386, 0f3EDE2602;
mul.f32 f412, f388, 0f3F48261C;
sub.f32 f413, f411, f412;
fma.rn.f32 f414, f390, 0f3F7994E0, f413;
sub.f32 f415, f410, f414;
add.f32 f416, f414, f410;
fma.rn.f32 f417, f376, 0f3F1F9D07, f344;
mul.f32 f418, f378, 0f3E63DC87;
sub.f32 f419, f417, f418;
mul.f32 f420, f380, 0f3F66A5E5;
sub.f32 f421, f419, f420;
sub.f32 f422, f347, f367;
mul.f32 f423, f422, 0f3F48261C;
sub.f32 f424, f351, f363;
fma.rn.f32 f425, f424, 0f3F7994E0, f423;
sub.f32 f426, f355, f359;
fma.rn.f32 f427, f426, 0f3EDE2602, f425;
add.f32 f428, f427, f421;
sub.f32 f429, f421, f427;
mul.f32 f430, f376, 0f3E63DC87;
sub.f32 f431, f344, f430;
mul.f32 f432, f378, 0f3F66A5E5;
sub.f32 f433, f431, f432;
fma.rn.f32 f434, f380, 0f3F1F9D07, f433;
mul.f32 f435, f422, 0f3F7994E0;
mul.f32 f436, f424, 0f3EDE2602;
sub.f32 f437, f435, f436;
mul.f32 f438, f426, 0f3F48261C;
sub.f32 f439, f437, f438;
add.f32 f440, f439, f434;
sub.f32 f441, f434, f439;
mul.f32 f442, f376, 0f3F66A5E5;
sub.f32 f443, f344, f442;
fma.rn.f32 f444, f378, 0f3F1F9D07, f443;
mul.f32 f445, f380, 0f3E63DC87;
sub.f32 f446, f444, f445;
mul.f32 f447, f422, 0f3EDE2602;
mul.f32 f448, f424, 0f3F48261C;
sub.f32 f449, f447, f448;
fma.rn.f32 f450, f426, 0f3F7994E0, f449;
add.f32 f451, f450, f446;
sub.f32 f452, f446, f450;
mul.wide.u32 rd12, r7, 1402438301;
shr.u64 rd13, rd12, 36;
cvt.u32.u64 r21, rd13;
mul.lo.s32 r22, r21, 49;
sub.s32 r23, r7, r22;
mul.wide.u32 rd14, r21, 8;
mov.u64 rd15, %17;
add.s64 rd16, rd15, rd14;
ld.global.v2.f32 {f453, f454}, [rd16];
mul.f32 f457, f428, f454;
mul.f32 f458, f392, f454;
mul.f32 f459, f453, f428;
mul.f32 f460, f453, f453;
mul.f32 f461, f454, f454;
sub.f32 f462, f460, f461;
mul.f32 f463, f454, f453;
fma.rn.f32 f464, f454, f453, f463;
mul.f32 f465, f440, f464;
mul.f32 f466, f404, f464;
mul.f32 f467, f462, f440;
mul.f32 f468, f453, f462;
mul.f32 f469, f454, f464;
sub.f32 f470, f468, f469;
mul.f32 f471, f453, f464;
fma.rn.f32 f472, f454, f462, f471;
mul.f32 f473, f451, f472;
mul.f32 f474, f415, f472;
mul.f32 f475, f470, f451;
mul.f32 f476, f453, f470;
mul.f32 f477, f454, f472;
sub.f32 f478, f476, f477;
mul.f32 f479, f453, f472;
fma.rn.f32 f480, f454, f470, f479;
mul.f32 f481, f452, f480;
mul.f32 f482, f416, f480;
mul.f32 f483, f478, f452;
mul.f32 f484, f453, f478;
mul.f32 f485, f454, f480;
sub.f32 f486, f484, f485;
mul.f32 f487, f453, f480;
fma.rn.f32 f488, f454, f478, f487;
mul.f32 f489, f441, f488;
mul.f32 f490, f405, f488;
mul.f32 f491, f486, f441;
mul.f32 f492, f453, f486;
mul.f32 f493, f454, f488;
sub.f32 f494, f492, f493;
mul.f32 f495, f453, f488;
fma.rn.f32 f496, f454, f486, f495;
mul.f32 f497, f429, f496;
mul.f32 f498, f393, f496;
mul.f32 f499, f494, f429;
shl.b32 r24, r23, 3;
add.s32 r25, r8, r24;
barrier.sync 0;
mad.lo.s32 r26, r21, 2744, r25;
add.f32 f500, f380, f379;
add.f32 f501, f375, f374;
st.shared.v2.f32 [r26], {f501, f500};
fma.rn.f32 f502, f453, f392, f457;
sub.f32 f503, f459, f458;
st.shared.v2.f32 [r26+392], {f502, f503};
fma.rn.f32 f504, f462, f404, f465;
sub.f32 f505, f467, f466;
st.shared.v2.f32 [r26+784], {f504, f505};
fma.rn.f32 f506, f470, f415, f473;
sub.f32 f507, f475, f474;
st.shared.v2.f32 [r26+1176], {f506, f507};
fma.rn.f32 f508, f478, f416, f481;
sub.f32 f509, f483, f482;
st.shared.v2.f32 [r26+1568], {f508, f509};
sub.f32 f510, f491, f490;
fma.rn.f32 f511, f486, f405, f489;
st.shared.v2.f32 [r26+1960], {f511, f510};
fma.rn.f32 f512, f494, f393, f497;
sub.f32 f513, f499, f498;
st.shared.v2.f32 [r26+2352], {f512, f513};
barrier.sync 0;
ld.shared.v2.f32 {f514, f515}, [r10];
ld.shared.v2.f32 {f518, f519}, [r10+2744];
ld.shared.v2.f32 {f522, f523}, [r10+5488];
ld.shared.v2.f32 {f526, f527}, [r10+8232];
ld.shared.v2.f32 {f530, f531}, [r10+10976];
ld.shared.v2.f32 {f534, f535}, [r10+13720];
ld.shared.v2.f32 {f538, f539}, [r10+16464];
add.f32 f542, f518, f538;
add.f32 f543, f514, f542;
add.f32 f544, f522, f534;
add.f32 f545, f544, f543;
add.f32 f546, f526, f530;
add.f32 f547, f519, f539;
add.f32 f548, f515, f547;
add.f32 f549, f523, f535;
add.f32 f550, f549, f548;
add.f32 f551, f527, f531;
fma.rn.f32 f552, f542, 0f3F1F9D07, f514;
mul.f32 f553, f544, 0f3E63DC87;
sub.f32 f554, f552, f553;
mul.f32 f555, f546, 0f3F66A5E5;
sub.f32 f556, f554, f555;
sub.f32 f557, f519, f539;
mul.f32 f558, f557, 0f3F48261C;
sub.f32 f559, f523, f535;
fma.rn.f32 f560, f559, 0f3F7994E0, f558;
sub.f32 f561, f527, f531;
fma.rn.f32 f562, f561, 0f3EDE2602, f560;
mul.f32 f563, f542, 0f3E63DC87;
sub.f32 f564, f514, f563;
mul.f32 f565, f544, 0f3F66A5E5;
sub.f32 f566, f564, f565;
fma.rn.f32 f567, f546, 0f3F1F9D07, f566;
mul.f32 f568, f557, 0f3F7994E0;
mul.f32 f569, f559, 0f3EDE2602;
sub.f32 f570, f568, f569;
mul.f32 f571, f561, 0f3F48261C;
sub.f32 f572, f570, f571;
mul.f32 f573, f542, 0f3F66A5E5;
sub.f32 f574, f514, f573;
fma.rn.f32 f575, f544, 0f3F1F9D07, f574;
mul.f32 f576, f546, 0f3E63DC87;
sub.f32 f577, f575, f576;
mul.f32 f578, f557, 0f3EDE2602;
mul.f32 f579, f559, 0f3F48261C;
sub.f32 f580, f578, f579;
fma.rn.f32 f581, f561, 0f3F7994E0, f580;
fma.rn.f32 f582, f547, 0f3F1F9D07, f515;
mul.f32 f583, f549, 0f3E63DC87;
sub.f32 f584, f582, f583;
mul.f32 f585, f551, 0f3F66A5E5;
sub.f32 f586, f584, f585;
sub.f32 f587, f518, f538;
mul.f32 f588, f587, 0f3F48261C;
sub.f32 f589, f522, f534;
fma.rn.f32 f590, f589, 0f3F7994E0, f588;
sub.f32 f591, f526, f530;
fma.rn.f32 f592, f591, 0f3EDE2602, f590;
mul.f32 f593, f547, 0f3E63DC87;
sub.f32 f594, f515, f593;
mul.f32 f595, f549, 0f3F66A5E5;
sub.f32 f596, f594, f595;
fma.rn.f32 f597, f551, 0f3F1F9D07, f596;
mul.f32 f598, f587, 0f3F7994E0;
mul.f32 f599, f589, 0f3EDE2602;
sub.f32 f600, f598, f599;
mul.f32 f601, f591, 0f3F48261C;
sub.f32 f602, f600, f601;
mul.f32 f603, f547, 0f3F66A5E5;
sub.f32 f604, f515, f603;
fma.rn.f32 f605, f549, 0f3F1F9D07, f604;
mul.f32 f606, f551, 0f3E63DC87;
sub.f32 f607, f605, f606;
mul.f32 f608, f587, 0f3EDE2602;
mul.f32 f609, f589, 0f3F48261C;
sub.f32 f610, f608, f609;
fma.rn.f32 f611, f591, 0f3F7994E0, f610;
add.f32 %1, f551, f550;
add.f32 %0, f546, f545;
add.f32 %3, f592, f586;
sub.f32 %2, f556, f562;
add.f32 %5, f602, f597;
sub.f32 %4, f567, f572;
add.f32 %7, f611, f607;
sub.f32 %6, f577, f581;
sub.f32 %9, f607, f611;
add.f32 %8, f581, f577;
sub.f32 %11, f597, f602;
add.f32 %10, f572, f567;
sub.f32 %13, f586, f592;
add.f32 %12, f562, f556;
})"
     // Operand numbering used by the PTX above:
     //   %0  .. %13  outputs: rmem[k].x is %(2k), rmem[k].y is %(2k+1), k = 0..6
     //   %14         smem (32-bit shared-memory base address)
     //   %15 .. %17  addresses of lut_sp_7_2401, lut_sp_7_343, lut_sp_7_49
     //   %18 .. %35  inputs rmem[0..6]. The .y components of rmem[1], rmem[2],
     //               rmem[4] and rmem[5] are each listed twice; the PTX only
     //               references the second copy (%22, %25, %30, %33), so %21,
     //               %24, %29 and %32 are never read. Removing a duplicate
     //               would shift every later operand index.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y): "r"(smem), "l"(lut_sp_7_2401), "l"(lut_sp_7_343), "l"(lut_sp_7_49), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y));
};




template<> __forceinline__ __device__ void cufftdx_private_function<379, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<584>;
.reg .b32 r<27>;
.reg .b64 rd<17>;
mov.u32 r1, %tid.y;
mov.u32 r2, %14;
mad.lo.s32 r3, r1, 9604, r2;
mov.u32 r4, %tid.x;
add.f32 f29, %20, %34;
add.f32 f30, %18, f29;
add.f32 f31, %23, %31;
add.f32 f32, f31, f30;
add.f32 f33, %26, %28;
add.f32 f34, f33, f32;
add.f32 f35, %22, %35;
add.f32 f36, %19, f35;
add.f32 f37, %25, %33;
add.f32 f38, f37, f36;
add.f32 f39, %27, %30;
add.f32 f40, f39, f38;
fma.rn.f32 f41, f29, 0f3F1F9D07, %18;
mul.f32 f42, f31, 0f3E63DC87;
sub.f32 f43, f41, f42;
mul.f32 f44, f33, 0f3F66A5E5;
sub.f32 f45, f43, f44;
sub.f32 f46, %22, %35;
mul.f32 f47, f46, 0f3F48261C;
sub.f32 f48, %25, %33;
fma.rn.f32 f49, f48, 0f3F7994E0, f47;
sub.f32 f50, %27, %30;
fma.rn.f32 f51, f50, 0f3EDE2602, f49;
sub.f32 f52, f45, f51;
add.f32 f53, f51, f45;
mul.f32 f54, f29, 0f3E63DC87;
sub.f32 f55, %18, f54;
mul.f32 f56, f31, 0f3F66A5E5;
sub.f32 f57, f55, f56;
fma.rn.f32 f58, f33, 0f3F1F9D07, f57;
mul.f32 f59, f46, 0f3F7994E0;
mul.f32 f60, f48, 0f3EDE2602;
sub.f32 f61, f59, f60;
mul.f32 f62, f50, 0f3F48261C;
sub.f32 f63, f61, f62;
sub.f32 f64, f58, f63;
add.f32 f65, f63, f58;
mul.f32 f66, f29, 0f3F66A5E5;
sub.f32 f67, %18, f66;
fma.rn.f32 f68, f31, 0f3F1F9D07, f67;
mul.f32 f69, f33, 0f3E63DC87;
sub.f32 f70, f68, f69;
mul.f32 f71, f46, 0f3EDE2602;
mul.f32 f72, f48, 0f3F48261C;
sub.f32 f73, f71, f72;
fma.rn.f32 f74, f50, 0f3F7994E0, f73;
sub.f32 f75, f70, f74;
add.f32 f76, f74, f70;
fma.rn.f32 f77, f35, 0f3F1F9D07, %19;
mul.f32 f78, f37, 0f3E63DC87;
sub.f32 f79, f77, f78;
mul.f32 f80, f39, 0f3F66A5E5;
sub.f32 f81, f79, f80;
sub.f32 f82, %20, %34;
mul.f32 f83, f82, 0f3F48261C;
sub.f32 f84, %23, %31;
fma.rn.f32 f85, f84, 0f3F7994E0, f83;
sub.f32 f86, %26, %28;
fma.rn.f32 f87, f86, 0f3EDE2602, f85;
add.f32 f88, f87, f81;
sub.f32 f89, f81, f87;
mul.f32 f90, f35, 0f3E63DC87;
sub.f32 f91, %19, f90;
mul.f32 f92, f37, 0f3F66A5E5;
sub.f32 f93, f91, f92;
fma.rn.f32 f94, f39, 0f3F1F9D07, f93;
mul.f32 f95, f82, 0f3F7994E0;
mul.f32 f96, f84, 0f3EDE2602;
sub.f32 f97, f95, f96;
mul.f32 f98, f86, 0f3F48261C;
sub.f32 f99, f97, f98;
add.f32 f100, f99, f94;
sub.f32 f101, f94, f99;
mul.f32 f102, f35, 0f3F66A5E5;
sub.f32 f103, %19, f102;
fma.rn.f32 f104, f37, 0f3F1F9D07, f103;
mul.f32 f105, f39, 0f3E63DC87;
sub.f32 f106, f104, f105;
mul.f32 f107, f82, 0f3EDE2602;
mul.f32 f108, f84, 0f3F48261C;
sub.f32 f109, f107, f108;
fma.rn.f32 f110, f86, 0f3F7994E0, f109;
add.f32 f111, f110, f106;
sub.f32 f112, f106, f110;
mul.wide.u32 rd2, r4, -1089394037;
shr.u64 rd3, rd2, 40;
cvt.u32.u64 r5, rd3;
mul.lo.s32 r6, r5, 343;
sub.s32 r7, r4, r6;
mul.wide.u32 rd4, r7, 8;
mov.u64 rd5, %15;
add.s64 rd6, rd5, rd4;
ld.global.v2.f32 {f113, f114}, [rd6];
mul.f32 f117, f88, f114;
fma.rn.f32 f118, f113, f52, f117;
mul.f32 f119, f52, f114;
mul.f32 f120, f113, f88;
sub.f32 f121, f120, f119;
mul.f32 f122, f113, f113;
mul.f32 f123, f114, f114;
sub.f32 f124, f122, f123;
mul.f32 f125, f114, f113;
fma.rn.f32 f126, f114, f113, f125;
mul.f32 f127, f100, f126;
fma.rn.f32 f128, f124, f64, f127;
mul.f32 f129, f64, f126;
mul.f32 f130, f124, f100;
sub.f32 f131, f130, f129;
mul.f32 f132, f113, f124;
mul.f32 f133, f114, f126;
sub.f32 f134, f132, f133;
mul.f32 f135, f113, f126;
fma.rn.f32 f136, f114, f124, f135;
mul.f32 f137, f111, f136;
fma.rn.f32 f138, f134, f75, f137;
mul.f32 f139, f75, f136;
mul.f32 f140, f134, f111;
sub.f32 f141, f140, f139;
mul.f32 f142, f113, f134;
mul.f32 f143, f114, f136;
sub.f32 f144, f142, f143;
mul.f32 f145, f113, f136;
fma.rn.f32 f146, f114, f134, f145;
mul.f32 f147, f112, f146;
fma.rn.f32 f148, f144, f76, f147;
mul.f32 f149, f76, f146;
mul.f32 f150, f144, f112;
sub.f32 f151, f150, f149;
mul.f32 f152, f113, f144;
mul.f32 f153, f114, f146;
sub.f32 f154, f152, f153;
mul.f32 f155, f113, f146;
fma.rn.f32 f156, f114, f144, f155;
mul.f32 f157, f101, f156;
fma.rn.f32 f158, f154, f65, f157;
mul.f32 f159, f65, f156;
mul.f32 f160, f154, f101;
sub.f32 f161, f160, f159;
mul.f32 f162, f113, f154;
mul.f32 f163, f114, f156;
sub.f32 f164, f162, f163;
mul.f32 f165, f113, f156;
fma.rn.f32 f166, f114, f154, f165;
mul.f32 f167, f89, f166;
fma.rn.f32 f168, f164, f53, f167;
mul.f32 f169, f53, f166;
mul.f32 f170, f164, f89;
sub.f32 f171, f170, f169;
mad.lo.s32 r8, r5, 9604, r3;
barrier.sync 0;
mad.lo.s32 r9, r7, 28, r8;
st.shared.f32 [r9], f34;
st.shared.f32 [r9+4], f118;
st.shared.f32 [r9+8], f128;
st.shared.f32 [r9+12], f138;
st.shared.f32 [r9+16], f148;
st.shared.f32 [r9+20], f158;
st.shared.f32 [r9+24], f168;
barrier.sync 0;
mad.lo.s32 r10, r7, -24, r9;
ld.shared.f32 f172, [r10];
ld.shared.f32 f173, [r10+1372];
ld.shared.f32 f174, [r10+2744];
ld.shared.f32 f175, [r10+4116];
ld.shared.f32 f176, [r10+5488];
ld.shared.f32 f177, [r10+6860];
ld.shared.f32 f178, [r10+8232];
barrier.sync 0;
st.shared.f32 [r9], f40;
st.shared.f32 [r9+4], f121;
st.shared.f32 [r9+8], f131;
st.shared.f32 [r9+12], f141;
st.shared.f32 [r9+16], f151;
st.shared.f32 [r9+20], f161;
st.shared.f32 [r9+24], f171;
barrier.sync 0;
ld.shared.f32 f179, [r10];
ld.shared.f32 f180, [r10+1372];
ld.shared.f32 f181, [r10+2744];
ld.shared.f32 f182, [r10+4116];
ld.shared.f32 f183, [r10+5488];
ld.shared.f32 f184, [r10+6860];
ld.shared.f32 f185, [r10+8232];
add.f32 f186, f173, f178;
add.f32 f187, f172, f186;
add.f32 f188, f174, f177;
add.f32 f189, f188, f187;
add.f32 f190, f175, f176;
add.f32 f191, f190, f189;
add.f32 f192, f180, f185;
add.f32 f193, f179, f192;
add.f32 f194, f181, f184;
add.f32 f195, f194, f193;
add.f32 f196, f182, f183;
add.f32 f197, f196, f195;
fma.rn.f32 f198, f186, 0f3F1F9D07, f172;
mul.f32 f199, f188, 0f3E63DC87;
sub.f32 f200, f198, f199;
mul.f32 f201, f190, 0f3F66A5E5;
sub.f32 f202, f200, f201;
sub.f32 f203, f180, f185;
mul.f32 f204, f203, 0f3F48261C;
sub.f32 f205, f181, f184;
fma.rn.f32 f206, f205, 0f3F7994E0, f204;
sub.f32 f207, f182, f183;
fma.rn.f32 f208, f207, 0f3EDE2602, f206;
sub.f32 f209, f202, f208;
add.f32 f210, f208, f202;
mul.f32 f211, f186, 0f3E63DC87;
sub.f32 f212, f172, f211;
mul.f32 f213, f188, 0f3F66A5E5;
sub.f32 f214, f212, f213;
fma.rn.f32 f215, f190, 0f3F1F9D07, f214;
mul.f32 f216, f203, 0f3F7994E0;
mul.f32 f217, f205, 0f3EDE2602;
sub.f32 f218, f216, f217;
mul.f32 f219, f207, 0f3F48261C;
sub.f32 f220, f218, f219;
sub.f32 f221, f215, f220;
add.f32 f222, f220, f215;
mul.f32 f223, f186, 0f3F66A5E5;
sub.f32 f224, f172, f223;
fma.rn.f32 f225, f188, 0f3F1F9D07, f224;
mul.f32 f226, f190, 0f3E63DC87;
sub.f32 f227, f225, f226;
mul.f32 f228, f203, 0f3EDE2602;
mul.f32 f229, f205, 0f3F48261C;
sub.f32 f230, f228, f229;
fma.rn.f32 f231, f207, 0f3F7994E0, f230;
sub.f32 f232, f227, f231;
add.f32 f233, f231, f227;
fma.rn.f32 f234, f192, 0f3F1F9D07, f179;
mul.f32 f235, f194, 0f3E63DC87;
sub.f32 f236, f234, f235;
mul.f32 f237, f196, 0f3F66A5E5;
sub.f32 f238, f236, f237;
sub.f32 f239, f173, f178;
mul.f32 f240, f239, 0f3F48261C;
sub.f32 f241, f174, f177;
fma.rn.f32 f242, f241, 0f3F7994E0, f240;
sub.f32 f243, f175, f176;
fma.rn.f32 f244, f243, 0f3EDE2602, f242;
add.f32 f245, f244, f238;
sub.f32 f246, f238, f244;
mul.f32 f247, f192, 0f3E63DC87;
sub.f32 f248, f179, f247;
mul.f32 f249, f194, 0f3F66A5E5;
sub.f32 f250, f248, f249;
fma.rn.f32 f251, f196, 0f3F1F9D07, f250;
mul.f32 f252, f239, 0f3F7994E0;
mul.f32 f253, f241, 0f3EDE2602;
sub.f32 f254, f252, f253;
mul.f32 f255, f243, 0f3F48261C;
sub.f32 f256, f254, f255;
add.f32 f257, f256, f251;
sub.f32 f258, f251, f256;
mul.f32 f259, f192, 0f3F66A5E5;
sub.f32 f260, f179, f259;
fma.rn.f32 f261, f194, 0f3F1F9D07, f260;
mul.f32 f262, f196, 0f3E63DC87;
sub.f32 f263, f261, f262;
mul.f32 f264, f239, 0f3EDE2602;
mul.f32 f265, f241, 0f3F48261C;
sub.f32 f266, f264, f265;
fma.rn.f32 f267, f243, 0f3F7994E0, f266;
add.f32 f268, f267, f263;
sub.f32 f269, f263, f267;
mul.wide.u32 rd7, r7, 613566757;
shr.u64 rd8, rd7, 32;
cvt.u32.u64 r11, rd8;
sub.s32 r12, r7, r11;
shr.u32 r13, r12, 1;
add.s32 r14, r13, r11;
shr.u32 r15, r14, 2;
mul.lo.s32 r16, r15, 7;
sub.s32 r17, r7, r16;
mul.wide.u32 rd9, r15, 8;
mov.u64 rd10, %16;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f270, f271}, [rd11];
mul.f32 f274, f245, f271;
fma.rn.f32 f275, f270, f209, f274;
mul.f32 f276, f209, f271;
mul.f32 f277, f270, f245;
sub.f32 f278, f277, f276;
mul.f32 f279, f270, f270;
mul.f32 f280, f271, f271;
sub.f32 f281, f279, f280;
mul.f32 f282, f271, f270;
fma.rn.f32 f283, f271, f270, f282;
mul.f32 f284, f257, f283;
fma.rn.f32 f285, f281, f221, f284;
mul.f32 f286, f221, f283;
mul.f32 f287, f281, f257;
sub.f32 f288, f287, f286;
mul.f32 f289, f270, f281;
mul.f32 f290, f271, f283;
sub.f32 f291, f289, f290;
mul.f32 f292, f270, f283;
fma.rn.f32 f293, f271, f281, f292;
mul.f32 f294, f268, f293;
fma.rn.f32 f295, f291, f232, f294;
mul.f32 f296, f232, f293;
mul.f32 f297, f291, f268;
sub.f32 f298, f297, f296;
mul.f32 f299, f270, f291;
mul.f32 f300, f271, f293;
sub.f32 f301, f299, f300;
mul.f32 f302, f270, f293;
fma.rn.f32 f303, f271, f291, f302;
mul.f32 f304, f269, f303;
fma.rn.f32 f305, f301, f233, f304;
mul.f32 f306, f233, f303;
mul.f32 f307, f301, f269;
sub.f32 f308, f307, f306;
mul.f32 f309, f270, f301;
mul.f32 f310, f271, f303;
sub.f32 f311, f309, f310;
mul.f32 f312, f270, f303;
fma.rn.f32 f313, f271, f301, f312;
mul.f32 f314, f258, f313;
fma.rn.f32 f315, f311, f222, f314;
mul.f32 f316, f222, f313;
mul.f32 f317, f311, f258;
sub.f32 f318, f317, f316;
mul.f32 f319, f270, f311;
mul.f32 f320, f271, f313;
sub.f32 f321, f319, f320;
mul.f32 f322, f270, f313;
fma.rn.f32 f323, f271, f311, f322;
mul.f32 f324, f246, f323;
fma.rn.f32 f325, f321, f210, f324;
mul.f32 f326, f210, f323;
mul.f32 f327, f321, f246;
sub.f32 f328, f327, f326;
shl.b32 r18, r17, 2;
add.s32 r19, r8, r18;
barrier.sync 0;
mad.lo.s32 r20, r15, 196, r19;
st.shared.f32 [r20], f191;
st.shared.f32 [r20+28], f275;
st.shared.f32 [r20+56], f285;
st.shared.f32 [r20+84], f295;
st.shared.f32 [r20+112], f305;
st.shared.f32 [r20+140], f315;
st.shared.f32 [r20+168], f325;
barrier.sync 0;
ld.shared.f32 f329, [r10];
ld.shared.f32 f330, [r10+1372];
ld.shared.f32 f331, [r10+2744];
ld.shared.f32 f332, [r10+4116];
ld.shared.f32 f333, [r10+5488];
ld.shared.f32 f334, [r10+6860];
ld.shared.f32 f335, [r10+8232];
barrier.sync 0;
st.shared.f32 [r20], f197;
st.shared.f32 [r20+28], f278;
st.shared.f32 [r20+56], f288;
st.shared.f32 [r20+84], f298;
st.shared.f32 [r20+112], f308;
st.shared.f32 [r20+140], f318;
st.shared.f32 [r20+168], f328;
barrier.sync 0;
ld.shared.f32 f336, [r10];
ld.shared.f32 f337, [r10+1372];
ld.shared.f32 f338, [r10+2744];
ld.shared.f32 f339, [r10+4116];
ld.shared.f32 f340, [r10+5488];
ld.shared.f32 f341, [r10+6860];
ld.shared.f32 f342, [r10+8232];
add.f32 f343, f330, f335;
add.f32 f344, f329, f343;
add.f32 f345, f331, f334;
add.f32 f346, f345, f344;
add.f32 f347, f332, f333;
add.f32 f348, f347, f346;
add.f32 f349, f337, f342;
add.f32 f350, f336, f349;
add.f32 f351, f338, f341;
add.f32 f352, f351, f350;
add.f32 f353, f339, f340;
add.f32 f354, f353, f352;
fma.rn.f32 f355, f343, 0f3F1F9D07, f329;
mul.f32 f356, f345, 0f3E63DC87;
sub.f32 f357, f355, f356;
mul.f32 f358, f347, 0f3F66A5E5;
sub.f32 f359, f357, f358;
sub.f32 f360, f337, f342;
mul.f32 f361, f360, 0f3F48261C;
sub.f32 f362, f338, f341;
fma.rn.f32 f363, f362, 0f3F7994E0, f361;
sub.f32 f364, f339, f340;
fma.rn.f32 f365, f364, 0f3EDE2602, f363;
sub.f32 f366, f359, f365;
add.f32 f367, f365, f359;
mul.f32 f368, f343, 0f3E63DC87;
sub.f32 f369, f329, f368;
mul.f32 f370, f345, 0f3F66A5E5;
sub.f32 f371, f369, f370;
fma.rn.f32 f372, f347, 0f3F1F9D07, f371;
mul.f32 f373, f360, 0f3F7994E0;
mul.f32 f374, f362, 0f3EDE2602;
sub.f32 f375, f373, f374;
mul.f32 f376, f364, 0f3F48261C;
sub.f32 f377, f375, f376;
sub.f32 f378, f372, f377;
add.f32 f379, f377, f372;
mul.f32 f380, f343, 0f3F66A5E5;
sub.f32 f381, f329, f380;
fma.rn.f32 f382, f345, 0f3F1F9D07, f381;
mul.f32 f383, f347, 0f3E63DC87;
sub.f32 f384, f382, f383;
mul.f32 f385, f360, 0f3EDE2602;
mul.f32 f386, f362, 0f3F48261C;
sub.f32 f387, f385, f386;
fma.rn.f32 f388, f364, 0f3F7994E0, f387;
sub.f32 f389, f384, f388;
add.f32 f390, f388, f384;
fma.rn.f32 f391, f349, 0f3F1F9D07, f336;
mul.f32 f392, f351, 0f3E63DC87;
sub.f32 f393, f391, f392;
mul.f32 f394, f353, 0f3F66A5E5;
sub.f32 f395, f393, f394;
sub.f32 f396, f330, f335;
mul.f32 f397, f396, 0f3F48261C;
sub.f32 f398, f331, f334;
fma.rn.f32 f399, f398, 0f3F7994E0, f397;
sub.f32 f400, f332, f333;
fma.rn.f32 f401, f400, 0f3EDE2602, f399;
add.f32 f402, f401, f395;
sub.f32 f403, f395, f401;
mul.f32 f404, f349, 0f3E63DC87;
sub.f32 f405, f336, f404;
mul.f32 f406, f351, 0f3F66A5E5;
sub.f32 f407, f405, f406;
fma.rn.f32 f408, f353, 0f3F1F9D07, f407;
mul.f32 f409, f396, 0f3F7994E0;
mul.f32 f410, f398, 0f3EDE2602;
sub.f32 f411, f409, f410;
mul.f32 f412, f400, 0f3F48261C;
sub.f32 f413, f411, f412;
add.f32 f414, f413, f408;
sub.f32 f415, f408, f413;
mul.f32 f416, f349, 0f3F66A5E5;
sub.f32 f417, f336, f416;
fma.rn.f32 f418, f351, 0f3F1F9D07, f417;
mul.f32 f419, f353, 0f3E63DC87;
sub.f32 f420, f418, f419;
mul.f32 f421, f396, 0f3EDE2602;
mul.f32 f422, f398, 0f3F48261C;
sub.f32 f423, f421, f422;
fma.rn.f32 f424, f400, 0f3F7994E0, f423;
add.f32 f425, f424, f420;
sub.f32 f426, f420, f424;
mul.wide.u32 rd12, r7, 1402438301;
shr.u64 rd13, rd12, 36;
cvt.u32.u64 r21, rd13;
mul.lo.s32 r22, r21, 49;
sub.s32 r23, r7, r22;
mul.wide.u32 rd14, r21, 8;
mov.u64 rd15, %17;
add.s64 rd16, rd15, rd14;
ld.global.v2.f32 {f427, f428}, [rd16];
mul.f32 f431, f402, f428;
fma.rn.f32 f432, f427, f366, f431;
mul.f32 f433, f366, f428;
mul.f32 f434, f427, f402;
sub.f32 f435, f434, f433;
mul.f32 f436, f427, f427;
mul.f32 f437, f428, f428;
sub.f32 f438, f436, f437;
mul.f32 f439, f428, f427;
fma.rn.f32 f440, f428, f427, f439;
mul.f32 f441, f414, f440;
fma.rn.f32 f442, f438, f378, f441;
mul.f32 f443, f378, f440;
mul.f32 f444, f438, f414;
sub.f32 f445, f444, f443;
mul.f32 f446, f427, f438;
mul.f32 f447, f428, f440;
sub.f32 f448, f446, f447;
mul.f32 f449, f427, f440;
fma.rn.f32 f450, f428, f438, f449;
mul.f32 f451, f425, f450;
fma.rn.f32 f452, f448, f389, f451;
mul.f32 f453, f389, f450;
mul.f32 f454, f448, f425;
sub.f32 f455, f454, f453;
mul.f32 f456, f427, f448;
mul.f32 f457, f428, f450;
sub.f32 f458, f456, f457;
mul.f32 f459, f427, f450;
fma.rn.f32 f460, f428, f448, f459;
mul.f32 f461, f426, f460;
fma.rn.f32 f462, f458, f390, f461;
mul.f32 f463, f390, f460;
mul.f32 f464, f458, f426;
sub.f32 f465, f464, f463;
mul.f32 f466, f427, f458;
mul.f32 f467, f428, f460;
sub.f32 f468, f466, f467;
mul.f32 f469, f427, f460;
fma.rn.f32 f470, f428, f458, f469;
mul.f32 f471, f415, f470;
fma.rn.f32 f472, f468, f379, f471;
mul.f32 f473, f379, f470;
mul.f32 f474, f468, f415;
sub.f32 f475, f474, f473;
mul.f32 f476, f427, f468;
mul.f32 f477, f428, f470;
sub.f32 f478, f476, f477;
mul.f32 f479, f427, f470;
fma.rn.f32 f480, f428, f468, f479;
mul.f32 f481, f403, f480;
fma.rn.f32 f482, f478, f367, f481;
mul.f32 f483, f367, f480;
mul.f32 f484, f478, f403;
sub.f32 f485, f484, f483;
shl.b32 r24, r23, 2;
add.s32 r25, r8, r24;
barrier.sync 0;
mad.lo.s32 r26, r21, 1372, r25;
st.shared.f32 [r26], f348;
st.shared.f32 [r26+196], f432;
st.shared.f32 [r26+392], f442;
st.shared.f32 [r26+588], f452;
st.shared.f32 [r26+784], f462;
st.shared.f32 [r26+980], f472;
st.shared.f32 [r26+1176], f482;
barrier.sync 0;
ld.shared.f32 f486, [r10];
ld.shared.f32 f487, [r10+1372];
ld.shared.f32 f488, [r10+2744];
ld.shared.f32 f489, [r10+4116];
ld.shared.f32 f490, [r10+5488];
ld.shared.f32 f491, [r10+6860];
ld.shared.f32 f492, [r10+8232];
barrier.sync 0;
st.shared.f32 [r26], f354;
st.shared.f32 [r26+196], f435;
st.shared.f32 [r26+392], f445;
st.shared.f32 [r26+588], f455;
st.shared.f32 [r26+784], f465;
st.shared.f32 [r26+980], f475;
st.shared.f32 [r26+1176], f485;
barrier.sync 0;
ld.shared.f32 f493, [r10];
ld.shared.f32 f494, [r10+1372];
ld.shared.f32 f495, [r10+2744];
ld.shared.f32 f496, [r10+4116];
ld.shared.f32 f497, [r10+5488];
ld.shared.f32 f498, [r10+6860];
ld.shared.f32 f499, [r10+8232];
add.f32 f500, f487, f492;
add.f32 f501, f486, f500;
add.f32 f502, f488, f491;
add.f32 f503, f502, f501;
add.f32 f504, f489, f490;
add.f32 f505, f494, f499;
add.f32 f506, f493, f505;
add.f32 f507, f495, f498;
add.f32 f508, f507, f506;
add.f32 f509, f496, f497;
fma.rn.f32 f510, f500, 0f3F1F9D07, f486;
mul.f32 f511, f502, 0f3E63DC87;
sub.f32 f512, f510, f511;
mul.f32 f513, f504, 0f3F66A5E5;
sub.f32 f514, f512, f513;
sub.f32 f515, f494, f499;
mul.f32 f516, f515, 0f3F48261C;
sub.f32 f517, f495, f498;
fma.rn.f32 f518, f517, 0f3F7994E0, f516;
sub.f32 f519, f496, f497;
fma.rn.f32 f520, f519, 0f3EDE2602, f518;
mul.f32 f521, f500, 0f3E63DC87;
sub.f32 f522, f486, f521;
mul.f32 f523, f502, 0f3F66A5E5;
sub.f32 f524, f522, f523;
fma.rn.f32 f525, f504, 0f3F1F9D07, f524;
mul.f32 f526, f515, 0f3F7994E0;
mul.f32 f527, f517, 0f3EDE2602;
sub.f32 f528, f526, f527;
mul.f32 f529, f519, 0f3F48261C;
sub.f32 f530, f528, f529;
mul.f32 f531, f500, 0f3F66A5E5;
sub.f32 f532, f486, f531;
fma.rn.f32 f533, f502, 0f3F1F9D07, f532;
mul.f32 f534, f504, 0f3E63DC87;
sub.f32 f535, f533, f534;
mul.f32 f536, f515, 0f3EDE2602;
mul.f32 f537, f517, 0f3F48261C;
sub.f32 f538, f536, f537;
fma.rn.f32 f539, f519, 0f3F7994E0, f538;
fma.rn.f32 f540, f505, 0f3F1F9D07, f493;
mul.f32 f541, f507, 0f3E63DC87;
sub.f32 f542, f540, f541;
mul.f32 f543, f509, 0f3F66A5E5;
sub.f32 f544, f542, f543;
sub.f32 f545, f487, f492;
mul.f32 f546, f545, 0f3F48261C;
sub.f32 f547, f488, f491;
fma.rn.f32 f548, f547, 0f3F7994E0, f546;
sub.f32 f549, f489, f490;
fma.rn.f32 f550, f549, 0f3EDE2602, f548;
mul.f32 f551, f505, 0f3E63DC87;
sub.f32 f552, f493, f551;
mul.f32 f553, f507, 0f3F66A5E5;
sub.f32 f554, f552, f553;
fma.rn.f32 f555, f509, 0f3F1F9D07, f554;
mul.f32 f556, f545, 0f3F7994E0;
mul.f32 f557, f547, 0f3EDE2602;
sub.f32 f558, f556, f557;
mul.f32 f559, f549, 0f3F48261C;
sub.f32 f560, f558, f559;
mul.f32 f561, f505, 0f3F66A5E5;
sub.f32 f562, f493, f561;
fma.rn.f32 f563, f507, 0f3F1F9D07, f562;
mul.f32 f564, f509, 0f3E63DC87;
sub.f32 f565, f563, f564;
mul.f32 f566, f545, 0f3EDE2602;
mul.f32 f567, f547, 0f3F48261C;
sub.f32 f568, f566, f567;
fma.rn.f32 f569, f549, 0f3F7994E0, f568;
add.f32 %0, f504, f503;
add.f32 %1, f509, f508;
add.f32 %3, f550, f544;
sub.f32 %2, f514, f520;
sub.f32 %4, f525, f530;
add.f32 %5, f560, f555;
sub.f32 %6, f535, f539;
add.f32 %7, f569, f565;
add.f32 %8, f539, f535;
sub.f32 %9, f565, f569;
add.f32 %10, f530, f525;
sub.f32 %11, f555, f560;
sub.f32 %13, f544, f550;
add.f32 %12, f520, f514;
})"
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y): "r"(smem), "l"(lut_sp_7_2401), "l"(lut_sp_7_343), "l"(lut_sp_7_49), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y));
};


#endif
